import os
import tempfile
import requests
import PyPDF2
from datasets import Dataset
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def download_pdf_to_temp(pdf_url, timeout=60):
    """
    Download a PDF from a URL into a temporary file and return the file's path.

    The file is created with ``delete=False``, so on success the caller owns
    it and is responsible for removing it. If anything goes wrong while
    downloading, the partially written file is removed here before the
    exception propagates, so a failed download leaves nothing behind.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled server would block the caller indefinitely.

    Returns:
        Path of the temporary ``.pdf`` file containing the downloaded bytes.

    Raises:
        requests.RequestException: if the request fails, times out, or
            ``raise_for_status()`` rejects the response.
        OSError: if writing the temporary file fails.
    """
    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        # Both context managers close on exit: the temp file is flushed and
        # closed, and the streamed HTTP connection is released.
        with tmp_file, requests.get(
            pdf_url, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                tmp_file.write(chunk)
    except BaseException:
        # The caller never sees the path on failure, so clean up here.
        # The file is already closed at this point, so removal is safe.
        try:
            os.remove(tmp_file.name)
        except OSError:
            pass
        raise
    return tmp_file.name


def extract_text_per_page(pdf_filename):
    """
    Read a PDF from disk and collect the text of every page.

    Returns a list of ``(page_number, text)`` tuples with 1-based page
    numbers. A page with no extractable text yields an empty string.
    Any error is printed rather than raised; whatever pages were collected
    before the error are still returned (an empty list if none were).
    """
    collected = []
    try:
        with open(pdf_filename, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)
            page_no = 0
            for pdf_page in pdf.pages:
                page_no += 1
                # Normalise a falsy extraction result to an empty string.
                collected.append((page_no, pdf_page.extract_text() or ""))
    except Exception as e:
        print(f"Error extracting text from {pdf_filename}: {e}")
    return collected


def process_pdf(pdf_path, source, rows):
    """
    Extract the text of one PDF and append one row per page to ``rows``.

    Args:
        pdf_path: The PDF's URL when ``source == "url"``; otherwise a path to
            a local PDF file.
        source: ``"url"`` to download the PDF to a temporary file first; any
            other value means ``pdf_path`` is read directly from disk.
        rows: List that receives one dict per page with the keys
            ``pdf_path``, ``source``, ``page`` and ``text``.

    The ``pdf_path`` stored in each row is always the value the caller passed
    in (the URL for downloaded PDFs), never the temporary download path, so
    rows stay traceable to their origin after the temp file is removed.

    Failures are printed and swallowed so that one bad PDF does not abort a
    batch; the temporary download, if any, is removed in every case.
    """
    temp_pdf = None
    try:
        # Keep the caller's identifier separate from the file actually read.
        local_path = pdf_path
        if source == "url":
            temp_pdf = download_pdf_to_temp(pdf_path)
            local_path = temp_pdf

        # Extract text per page
        page_texts = extract_text_per_page(local_path)

        # Append results to the shared rows list
        for page_number, text in page_texts:
            rows.append(
                {
                    "pdf_path": pdf_path,
                    "source": source,
                    "page": page_number,
                    "text": text,
                }
            )
    except Exception as e:
        print(f"Failed to process {pdf_path}: {e}")
    finally:
        # Cleanup temp file if used
        if temp_pdf and os.path.exists(temp_pdf):
            os.remove(temp_pdf)

def main(pdf_sources, repo_id="your-username/your-dataset-name"):
    """
    Process a list of PDF sources concurrently and push the text to the Hub.

    Args:
        pdf_sources: Iterable of strings. Each one is either an
            ``http://`` / ``https://`` URL, a directory (its ``.pdf`` files
            are processed, non-recursively), or a path to a single ``.pdf``
            file. Anything else is reported and skipped.
        repo_id: Hugging Face Hub repository the dataset is pushed to.

    One row per page is collected via ``process_pdf``; the rows are turned
    into a ``Dataset`` and pushed with ``push_to_hub``. A failed push is
    printed, not raised.

    NOTE: a zero-argument ``main()`` defined later in this file rebinds the
    name ``main``, so this version is not what ``main`` refers to once the
    module has finished loading.
    """
    # Collect all PDF paths as (path_or_url, source_kind) pairs
    pdf_list = []

    for source in pdf_sources:
        lowered = source.lower()
        # Require a real scheme so local names like "httpdocs" are not
        # mistaken for URLs.
        if lowered.startswith(("http://", "https://")):
            pdf_list.append((source, "url"))
        elif os.path.isdir(source):  # Local folder
            # Sorted so the submission order does not depend on the OS.
            for entry in sorted(os.listdir(source)):
                if entry.lower().endswith(".pdf"):
                    pdf_list.append((os.path.join(source, entry), "local"))
        elif os.path.isfile(source) and lowered.endswith(".pdf"):  # Single PDF
            pdf_list.append((source, "local"))
        else:
            # Previously dropped without a trace; make the skip visible.
            print(f"Skipping unrecognized PDF source: {source}")

    print(f"Found {len(pdf_list)} PDFs to process.")

    # Shared dataset rows; worker threads append to this list
    rows = []

    # Process PDFs concurrently
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        future_to_pdf = {
            executor.submit(process_pdf, pdf, src, rows): pdf for pdf, src in pdf_list
        }

        for future in tqdm(
            as_completed(future_to_pdf),
            total=len(future_to_pdf),
            desc="Processing PDFs",
        ):
            # Re-raises here anything that escaped the worker's own handler.
            future.result()

    # Create Hugging Face dataset
    hf_dataset = Dataset.from_list(rows)
    print(f"Dataset created with {len(rows)} rows.")

    # Push to Hugging Face Hub
    try:
        hf_dataset.push_to_hub(repo_id)
        print("Dataset successfully pushed to Hugging Face Hub!")
    except Exception as e:
        print("Failed to push dataset:", e)


def main():
    """
    Extract per-page text from the PDFs in ``bad_pdf_urls`` and push it to the Hub.

    Each entry is downloaded to a temporary file, its pages are extracted,
    and one ``(pdf_url, page, text)`` row per page is collected. The rows are
    turned into a ``Dataset`` and pushed to ``Zaynes/bad_pdf_text``. Errors
    for an individual entry, and a failed push, are printed rather than
    raised. The temporary download is removed after every entry.

    NOTE: this definition reuses the name ``main`` and therefore replaces the
    earlier ``main(pdf_sources)`` in this file; it is what the script entry
    point runs.
    """
    # List of PDF URLs to process.
    # Not read by the loop below, which iterates ``bad_pdf_urls``; kept as the
    # reference list of sources.
    good_pdf_urls = [
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2010.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2009.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2008.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2007.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-solution-2011.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2010.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2009.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2008.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/chem-13-news-exam-solution-2007.pdf",
        "https://www.press.muni.cz/media/3019066/answers_to_all_questions.pdf",
        "https://public.wsu.edu/~thorglab/biol301/exams/finalkey.pdf",
        "https://people.bu.edu/msoren/BI515_2014/Exam1key.pdf",
        "https://facultystaff.richmond.edu/~lrunyenj/bio201/04bio201%20exam%201%20key.pdf",
        "https://www.usabo-trc.org/sites/default/files/images/pdf/exams/semifinal-answers/2011-semifinal-answers.pdf",
        "https://sites.science.oregonstate.edu/chemistry/courses/ch411/restrict2/ch411%20F11%20final%20key%20.pdf",
        "https://www.mit.edu/~anugrah/files/MockIChOSolutions.pdf",
        "https://www.mit.edu/~anugrah/files/2012CChOLocalSoln.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2022/11/CCC-PtA-2022-ENG-final-ANSWERS.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2022/12/Canadian-Chemistry-Olympiad-2022-EN_key.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2022/01/CCC-PtA-2021-ENG-ANSWERS.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2021/01/CCC-PtA-2020-ENG-ANSWERS-revised-COVID19.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2019-PtA-answers.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2017-PtA-answers-EN.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2018-PtA-answers-EN.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2016-PtA-answers-EN.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2015-PtA-answers-EN.pdf",
        "https://www.cheminst.ca/wp-content/uploads/2019/11/CCC-2014-PtA-answers-EN.pdf",
        "https://www.ttcho.com/_files/ugd/988b76_e3d21fc42bb24de4959d7250f0cfcb3a.pdf",
        "https://www.ttcho.com/_files/ugd/988b76_01ceeff230b24cbbb0125b2bfa3f3475.pdf",
        "https://www.ttcho.com/_files/ugd/988b76_48944f6ace684143bfdc9080fca59862.pdf",
        "https://www.ttcho.com/_files/ugd/988b76_ba0cb3177d05436da273a400a037ed01.pdf",
        "http://chemistryrace.com/wp-content/uploads/2025/02/ChR_2025_answer_booklet.pdf",
        "http://chemistryrace.com/wp-content/uploads/2024/02/ChR_2024_answer_booklet.pdf",
        "http://chemistryrace.com/wp-content/uploads/2023/02/chemistry_race_2023_answers-book.pdf",
        "http://chemistryrace.com/wp-content/uploads/2022/02/cambridge_chemistry_race_2022_answers.pdf",
        "http://chemistryrace.com/wp-content/uploads/2021/02/Chemistry_Race_2021_Questions_and_Solutions.pdf",
        "https://chemistryrace.soc.srcf.net/wp-content/uploads/2020/02/Chemistry_Race_2020_Questions_and_Solutions.pdf",
        "https://biolympiads.com/wp-content/uploads/2015/01/2003_OpenExam_AnswerKey2.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2004_OpenExam_AnswerKey4.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2005_OpenExam_AnswerKey1.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2006_OpenExam_AnsKey2.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2007_OpenExam_AnsKey1.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2008_OpenExam_AnsKey1.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2009_OpenExam_AnsKey1.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2010_OpenExam_AnsKey3.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2011_OpenExam_AnsKey.pdf",
        "http://biolympiads.com/wp-content/uploads/2015/01/2012_OpenExam_AnsKey2.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2024/02/ASOE-Chemistry-2023-Exam-Paper-with-Answers.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2023/05/ASOE-Chemistry-2022-ASDAN-answers.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2023/05/ASOE_Chemistry_2021_answers_reduced-FS.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2020-asoe-chemistry-exam-answers.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2019-asoe-chemistry-exam-answers.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2018-asoe-chemistry-exam-answers.pdf",
        "https://www.andrews.edu/~rwright/physics/OpenStax%20Physics-Student%20Solution%20Manual.pdf",
        "https://reanphysics.wordpress.com/wp-content/uploads/2018/11/raymond_a-_serway_john_w-_jewett_student_solutibookzz-org.pdf",
        "https://carrollscaveofknowledge.weebly.com/uploads/2/0/2/8/20287891/physics_11_regular_year_solutions_manual.pdf",
        "https://doctor2019.jumedicine.com/wp-content/uploads/sites/10/2019/09/Giancoli-Physics-Principles-With-Applications-7th-c2014-solutions-ISM.pdf",
        "https://ia801305.us.archive.org/8/items/ProblemsInCalculusOfOneVariableI.A.Maron/Problems%20in%20Calculus%20of%20One%20Variable%20-%20I.%20A.%20Maron.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2017-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2016-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2015-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2014-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2013-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2012-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2011-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2009-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2008-asoe-chemistry-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2007-asoe-chemistry-exam.pdf",
        "https://www.arml.com/ARML/arml_2019/public_contest_files/2023_contest_file/ARML_2023Contest.pdf",
        "https://jeeadv.ac.in/past_qps/2007_1.pdf",
        "https://jeeadv.ac.in/past_qps/2007_2.pdf",
        "https://jeeadv.ac.in/past_qps/2008_1.pdf",
        "https://jeeadv.ac.in/past_qps/2008_2.pdf",
        "https://jeeadv.ac.in/past_qps/2009_1.pdf",
        "https://jeeadv.ac.in/past_qps/2009_2.pdf",
        "https://jeeadv.ac.in/past_qps/2010_1.pdf",
        "https://jeeadv.ac.in/past_qps/2010_2.pdf",
        "https://jeeadv.ac.in/past_qps/2011_1.pdf",
        "https://jeeadv.ac.in/past_qps/2011_2.pdf",
        "https://jeeadv.ac.in/past_qps/2012_1.pdf",
        "https://jeeadv.ac.in/past_qps/2012_2.pdf",
        "https://jeeadv.ac.in/past_qps/2013_1.pdf",
        "https://jeeadv.ac.in/past_qps/2013_2.pdf",
        "https://jeeadv.ac.in/past_qps/2014_1.pdf",
        "https://jeeadv.ac.in/past_qps/2014_2.pdf",
        "https://jeeadv.ac.in/past_qps/2015_1.pdf",
        "https://jeeadv.ac.in/past_qps/2015_2.pdf",
        "https://jeeadv.ac.in/past_qps/2017_1.pdf",
        "https://jeeadv.ac.in/past_qps/2017_2.pdf",
        "https://jeeadv.ac.in/past_qps/2018_1.pdf",
        "https://jeeadv.ac.in/past_qps/2018_2.pdf",
        "https://jeeadv.ac.in/past_qps/2019_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2019_2_English.pdf",
        "https://jeeadv.ac.in/past_qps/2020_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2020_2_English.pdf",
        "https://jeeadv.ac.in/past_qps/2021_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2021_2_English.pdf",
        "https://jeeadv.ac.in/past_qps/2022_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2022_2_English.pdf",
        "https://jeeadv.ac.in/past_qps/2023_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2023_2_English.pdf",
        "https://jeeadv.ac.in/past_qps/2024_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2024_2_English.pdf",
        "https://ia600608.us.archive.org/19/items/IrodovProblemsInGeneralPhysics/Irodov-Problems_in_General_Physics.pdf",
        "https://ia601305.us.archive.org/8/items/ProblemsInCalculusOfOneVariableI.A.Maron/Problems%20in%20Calculus%20of%20One%20Variable%20-%20I.%20A.%20Maron.pdf",
        "https://blogmedia.testbook.com/kmat-kerala/wp-content/uploads/2023/06/physical-chemistry-by-p-bahadur-5113ed32.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2023-ASOE-Physics-Past-Paper-ASI.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2022-ASOE-Physics.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2021-ASOE-Physics.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2020-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2019-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2017-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2016-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2015-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2014-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2013-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2012-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2011-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-Physics-NQE-paper-FINAL.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2010-Physics-NQE-paper-FINAL.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2008-asoe-physics-exam.pdf",
        "https://www.asi.edu.au/wp-content/uploads/2022/12/2007-asoe-physics-exam.pdf",
        "http://algorithmics.lsi.upc.edu/docs/Dasgupta-Papadimitriou-Vazirani.pdf",
        "https://biocomp.utoronto.ca/files/2024/05/Exam2024.pdf",
        "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2019.pdf",
        "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2018_0.pdf",
        "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2017.pdf",
        "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2016.pdf",
        "https://biocomp.utoronto.ca/files/2023/10/biocomp-exam-2015.pdf",
        "https://www.andrews.edu/~rwright/physics/OpenStax%20Physics-Student%20Solution%20Manual.pdf",
        "https://reanphysics.wordpress.com/wp-content/uploads/2018/11/raymond_a-_serway_john_w-_jewett_student_solutibookzz-org.pdf",
        "https://carrollscaveofknowledge.weebly.com/uploads/2/0/2/8/20287891/physics_11_regular_year_solutions_manual.pdf",
        "https://doctor2019.jumedicine.com/wp-content/uploads/sites/10/2019/09/Giancoli-Physics-Principles-With-Applications-7th-c2014-solutions-ISM.pdf",
        "http://lib.ysu.am/disciplines_bk/ea4b336028cd91ba7265865d8fde153c.pdf",
        "https://www.sfcollege.edu/_media/Assets/sf/placement/files/Chemistry%20Practice%20Placement%20Exam.pdf",
        "https://www.pvamu.edu/chemistry/wp-content/uploads/sites/26/_01_pract_Test.pdf",
        "https://sccollege.edu/students/studentservices/counseling/Shared%20Documents/Chem%20Placement%20Practice%20Exam%202023.pdf",
        "https://www.csn.edu/sites/default/files/documents/department-documents/department-of-physical-sciences/chemplacementtestpractice.pdf",
        "https://www.mc3.edu/admissions/applying-to-mccc/testing-and-assessment/assets/biology-placement-test.pdf",
    ]

    # NOTE(review): this entry looks like a local path rather than a URL, so
    # the download step is expected to fail and be reported below — confirm
    # the intended input.
    bad_pdf_urls = ["~/Downloads/0000"]

    # This will hold our dataset rows (column-oriented: one list per column)
    rows = {"pdf_url": [], "page": [], "text": []}

    for pdf_url in bad_pdf_urls:
        print(f"Processing {pdf_url}")
        # Reset for every entry: if the download raises, the cleanup below
        # must neither hit an unbound name nor see the previous entry's path.
        temp_pdf = None
        try:
            # Download PDF to a temporary file
            temp_pdf = download_pdf_to_temp(pdf_url)

            # Extract text from each page
            page_texts = extract_text_per_page(temp_pdf)

            # Create a row in the dataset for each page
            for page_number, text in page_texts:
                rows["pdf_url"].append(pdf_url)
                rows["page"].append(page_number)
                rows["text"].append(text)
        except Exception as e:
            print(f"Error processing {pdf_url}: {e}")
        finally:
            # Remove the temporary file if one was created for this entry
            if temp_pdf is not None and os.path.exists(temp_pdf):
                os.remove(temp_pdf)

    # Create a Hugging Face dataset from the collected rows
    hf_dataset = Dataset.from_dict(rows)
    print("Dataset created with", hf_dataset.num_rows, "rows.")

    # Push the dataset to the Hugging Face Hub.
    # Change the repo id below to push somewhere else.
    try:
        hf_dataset.push_to_hub("Zaynes/bad_pdf_text")
        print("Dataset successfully pushed to the Hugging Face Hub!")
    except Exception as e:
        print("Failed to push dataset to the hub:", e)


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported. ``main`` here is the zero-argument
# definition, since it is the last one bound to that name in this file.
if __name__ == "__main__":
    main()
